1 Setup

1.1 Load libraries

# string manipulation
library(stringr)
# data wrangling
library(dplyr)
library(tidyr)
library(purrr)
# plot
library(ggplot2)
# write/read text files
library(readr)

If some libraries are not found, they have to be installed first and then loaded. Below is an example:

# Install the missing packages once, then load them as usual.
install.packages(c("dplyr", "tidyr"))
library(dplyr)
library(tidyr)

1.2 File paths

Specify the directory containing raw files:

# Directory containing the raw Web of Science query output files.
files_dir <- "../data/raw/Wos advanced search query results_verwerking/"

Retrieve all input text files:

# List everything in the input directory and keep only the .txt files.
files_all <- dir(files_dir)
# Anchor the pattern so only names *ending* in ".txt" match; the original
# bare pattern "txt" would also match e.g. "old_txt_backup.csv".
# str_subset() is equivalent to x[str_detect(x, pattern)].
files <- str_subset(files_all, pattern = "\\.txt$")
files
##  [1] "WoS_search_query_data_2011_WE01.txt"
##  [2] "WoS_search_query_data_2011_WE02.txt"
##  [3] "WoS_search_query_data_2011_WE04.txt"
##  [4] "WoS_search_query_data_2011_WE05.txt"
##  [5] "WoS_search_query_data_2011_WE06.txt"
##  [6] "WoS_search_query_data_2011_WE07.txt"
##  [7] "WoS_search_query_data_2011_WE08.txt"
##  [8] "WoS_search_query_data_2011_WE09.txt"
##  [9] "WoS_search_query_data_2011_WE10.txt"
## [10] "WoS_search_query_data_2011_WE11.txt"
## [11] "WoS_search_query_data_2011_WE12.txt"
## [12] "WoS_search_query_data_2011_WE13.txt"
## [13] "WoS_search_query_data_2011_WE14.txt"
## [14] "WoS_search_query_data_2011_WE15.txt"
## [15] "WoS_search_query_data_2012_WE01.txt"
## [16] "WoS_search_query_data_2012_WE02.txt"
## [17] "WoS_search_query_data_2012_WE04.txt"
## [18] "WoS_search_query_data_2012_WE05.txt"
## [19] "WoS_search_query_data_2012_WE06.txt"
## [20] "WoS_search_query_data_2012_WE07.txt"
## [21] "WoS_search_query_data_2012_WE08.txt"
## [22] "WoS_search_query_data_2012_WE09.txt"
## [23] "WoS_search_query_data_2012_WE10.txt"
## [24] "WoS_search_query_data_2012_WE11.txt"
## [25] "WoS_search_query_data_2012_WE12.txt"
## [26] "WoS_search_query_data_2012_WE13.txt"
## [27] "WoS_search_query_data_2012_WE14.txt"
## [28] "WoS_search_query_data_2012_WE15.txt"
## [29] "WoS_search_query_data_2013_WE01.txt"
## [30] "WoS_search_query_data_2013_WE02.txt"
## [31] "WoS_search_query_data_2013_WE04.txt"
## [32] "WoS_search_query_data_2013_WE05.txt"
## [33] "WoS_search_query_data_2013_WE06.txt"
## [34] "WoS_search_query_data_2013_WE07.txt"
## [35] "WoS_search_query_data_2013_WE08.txt"
## [36] "WoS_search_query_data_2013_WE09.txt"
## [37] "WoS_search_query_data_2013_WE10.txt"
## [38] "WoS_search_query_data_2013_WE11.txt"
## [39] "WoS_search_query_data_2013_WE12.txt"
## [40] "WoS_search_query_data_2013_WE13.txt"
## [41] "WoS_search_query_data_2013_WE14.txt"
## [42] "WoS_search_query_data_2013_WE15.txt"
## [43] "WoS_search_query_data_2014_WE01.txt"
## [44] "WoS_search_query_data_2014_WE02.txt"
## [45] "WoS_search_query_data_2014_WE04.txt"
## [46] "WoS_search_query_data_2014_WE05.txt"
## [47] "WoS_search_query_data_2014_WE06.txt"
## [48] "WoS_search_query_data_2014_WE07.txt"
## [49] "WoS_search_query_data_2014_WE08.txt"
## [50] "WoS_search_query_data_2014_WE09.txt"
## [51] "WoS_search_query_data_2014_WE10.txt"
## [52] "WoS_search_query_data_2014_WE11.txt"
## [53] "WoS_search_query_data_2014_WE12.txt"
## [54] "WoS_search_query_data_2014_WE13.txt"
## [55] "WoS_search_query_data_2014_WE14.txt"
## [56] "WoS_search_query_data_2014_WE15.txt"
## [57] "WoS_search_query_data_2015_WE01.txt"
## [58] "WoS_search_query_data_2015_WE02.txt"
## [59] "WoS_search_query_data_2015_WE04.txt"
## [60] "WoS_search_query_data_2015_WE05.txt"
## [61] "WoS_search_query_data_2015_WE06.txt"
## [62] "WoS_search_query_data_2015_WE07.txt"
## [63] "WoS_search_query_data_2015_WE08.txt"
## [64] "WoS_search_query_data_2015_WE09.txt"
## [65] "WoS_search_query_data_2015_WE10.txt"
## [66] "WoS_search_query_data_2015_WE11.txt"
## [67] "WoS_search_query_data_2015_WE12.txt"
## [68] "WoS_search_query_data_2015_WE13.txt"
## [69] "WoS_search_query_data_2015_WE14.txt"
## [70] "WoS_search_query_data_2015_WE15.txt"
## [71] "WoS_search_query_data_2016_WE01.txt"
## [72] "WoS_search_query_data_2016_WE02.txt"
## [73] "WoS_search_query_data_2016_WE04.txt"
## [74] "WoS_search_query_data_2016_WE05.txt"
## [75] "WoS_search_query_data_2016_WE06.txt"
## [76] "WoS_search_query_data_2016_WE07.txt"
## [77] "WoS_search_query_data_2016_WE08.txt"
## [78] "WoS_search_query_data_2016_WE09.txt"
## [79] "WoS_search_query_data_2016_WE10.txt"
## [80] "WoS_search_query_data_2016_WE11.txt"
## [81] "WoS_search_query_data_2016_WE12.txt"
## [82] "WoS_search_query_data_2016_WE13.txt"
## [83] "WoS_search_query_data_2016_WE14.txt"
## [84] "WoS_search_query_data_2016_WE15.txt"

2 Load and tidy data

2.1 Import raw citation data

Import the text files containing the raw output of WOS queries:

# Read every raw WoS query file into a data frame.
# skip = 3 drops the WoS header lines before the data; X2 is forced to
# character because it can hold either a year or a journal title.
raw_WOS_output <- map(files, function(f)
  read_csv(
    str_c(files_dir, f, sep = "/"),
    skip = 3, col_names = FALSE, col_types = cols(X2 = col_character())
  ))

2.2 Tidy data

Analysis will benefit from tidying our data. In tidy data:

  1. Each variable forms a column.
  2. Each observation forms a row.

In our case, year and departement are two important variables.

2.2.1 Add year and departement

Get year and departement identifiers WExx from filenames:

# Extract the departement code ("WExx") and the 4-digit year directly
# from each filename. A regular expression is more robust than splitting
# on "_" and relying on a fixed element position ([6] and [5]), and
# str_extract() is vectorized, so no map_chr() loop is needed.
deps <- str_extract(files, pattern = "WE\\d{2}")
years <- str_extract(files, pattern = "\\d{4}")

Add departement and year to each data frame:

# Attach the departement and year identifiers to each raw data frame in
# a single pass, so the origin of every row survives the later merge.
raw_WOS_df <- pmap(
  list(raw_WOS_output, deps, years),
  function(df, d, y) mutate(df, dep = d, year = y)
)

2.2.2 Cleaning

Some rows do not contain any relevant information:

# Inspect rows where both X2 and X3 are missing: they carry neither a
# publication year nor a journal title.
filter(raw_WOS_df[[1]], is.na(X2), is.na(X3))

We remove them from all data frames:

# Drop rows that have neither a year (X2) nor a journal (X3);
# !(a & b) is equivalent to the original (!a | !b).
raw_WOS_df <- map(raw_WOS_df, ~ filter(.x, !(is.na(X2) & is.na(X3))))

Some data frames contain rows with journal titles in the second column instead of the third one:

# Rows where X2 is present but not numeric: the journal title ended up
# in the second column (as.numeric() yields NA for non-numeric text).
raw_WOS_df[[1]] %>%
  filter(!is.na(X2), is.na(as.numeric(X2)))

In these cases we should copy the content of the second column to the third one. We remove these titles from second column, as it should contain year of publication instead. We also give appropriate names to the columns. We call this list of data frames clean_WOS_dfs:

# When X3 is missing, the journal title sits in X2: copy it over.
# coalesce(X3, X2) takes X2 exactly where X3 is NA, otherwise keeps X3.
clean_WOS_dfs <- map(raw_WOS_df, function(x)
  mutate(x, X3 = coalesce(X3, X2)))
# Give the columns meaningful names and parse publication_year as a
# number (journal-title values of X2 become NA); then drop the raw
# X1/X2/X3 columns.
clean_WOS_dfs <- map(clean_WOS_dfs, function(x)
  x %>%
    mutate(
      publication_year = as.numeric(X2),
      journal = X3,
      author = X1
    ) %>%
    select(-starts_with("X")))

We can now merge all the data frames together, thus creating a complete tidy data frame:

# Stack all per-file data frames into one tidy data frame; the dep and
# year columns added earlier identify the origin of each row.
tidy_WOS_df <- bind_rows(clean_WOS_dfs)

Some journals are in lowercase. It means they are not A1 journals. Some examples:

# Preview journal names that are not fully uppercase (non-A1 journals).
tidy_WOS_df %>%
  filter(journal != toupper(journal)) %>%
  head(n = 10)

We remove them:

# Keep only the all-uppercase journal names (the A1 journals).
tidy_WOS_df <- filter(tidy_WOS_df, journal == toupper(journal))

Some preview (randomly picked up) of the tidy data frame:

# Random sample of 20 rows, ordered by departement, as a sanity check.
tidy_WOS_df %>%
  sample_n(size = 20) %>%
  arrange(dep)

Save the tidy data frame, as it is the base of any further analysis:

# Save the tidy data frame. `file` replaces the `path` argument of
# write_tsv(), which is deprecated since readr 1.4.
write_tsv(tidy_WOS_df,
          file = "../data/processed/tidy_WOS_df.txt",
          na = "")

3 Analyze data

3.1 Total number of cited journals

First, we calculate the total number of cited journals per departement per year:

# Total number of distinct cited journals per departement and year;
# n_distinct() counts each journal once, no matter how often it is cited
# (same result as distinct() followed by count()).
total_n_journals <- tidy_WOS_df %>%
  group_by(dep, year) %>%
  summarize(tot_n_journals = n_distinct(journal))
total_n_journals

See graphs below:

More details about changes in number of cited journals for each departement:

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

Maximum and minimum:

## [1] "Maximum: 3660 (dep WE11, year 2015)"
## [1] "Minimum: 107 (dep WE15, year 2016)"

Some stats for each departement:

# Per-departement summary of the yearly totals. as.integer() truncates
# the decimals, which keeps the report's original (floor-like) figures.
stats_cited_journals <- total_n_journals %>%
  group_by(dep) %>%
  summarize(
    mean_journals = as.integer(mean(tot_n_journals)),
    st_dev_journals = as.integer(sd(tot_n_journals)),
    # relative variability, in percent of the mean
    perc_variability = as.integer(st_dev_journals/mean_journals*100)
  ) %>%
  arrange(desc(mean_journals))

stats_cited_journals

3.2 Fixed threshold

We first show how many journals have been cited, for each departement and year, more than x times, with x between 1 and 10:

# Thresholds to test: cited more than x times, x = 1..10.
limit <- 1:10
# Citation counts per departement, year and journal, most cited first;
# count(dep, year, journal) is the shorthand for group_by() + count().
tot_n_journals <- tidy_WOS_df %>%
  count(dep, year, journal) %>%
  arrange(dep, year, desc(n))

# For every threshold in `limit`, flag whether each journal was cited
# more than that many times ("+") or at most that many ("- or =").
# map_df() row-binds the per-threshold results directly, replacing the
# separate map() + bind_rows() steps.
more_less_limit <- map_df(limit, function(thr)
  tot_n_journals %>%
    mutate(
      more_or_less = if_else(n > thr, "+", "- or ="),
      limit = thr
    ))

# Count the journals on each side of every threshold, per departement
# and year.
counts_by_side <- more_less_limit %>%
  group_by(dep, year, limit, more_or_less) %>%
  summarize(n_journals = n()) %>%
  ungroup()

# Express those counts as a percentage of the departement/year total.
stat_more_less_limit <- counts_by_side %>%
  left_join(total_n_journals, by = c("dep", "year")) %>%
  mutate(perc_n_journals = round(n_journals/tot_n_journals*100))

Stacked histogram:

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

3.3 Relative threshold

By using a fixed threshold, the proportion of journals cited more than x times by each departement will never be the same. So, alternatively, we can fix the percentage of cited journals and then calculate the percentiles based on the total number of cited journals by each departement. We use percentiles from 5% to 30% with steps of 5%:

rank_journals <- tot_n_journals %>%
  group_by(dep, year) %>%
  # rank 1 = most cited journal; tied journals share the lowest rank
  mutate(rank = rank(desc(n), ties.method = "min")) %>%
  ungroup() %>%
  left_join(total_n_journals, by = c("dep","year")) %>%
  # position of each journal as a percentage of the departement/year
  # total; compared against the percentile thresholds below
  mutate(perc_rank = rank / tot_n_journals * 100)
  
  
# Percentile thresholds 5%, 10%, ..., 30%; the names label the list of
# plots produced further below.
limit_perc <- seq(from = 5, to = 30, by = 5)
names(limit_perc) <- paste("perc_rank", limit_perc, sep = "_")

# For each threshold keep the journals ranking inside it, then count
# them per departement and year and collect their (rank-ordered) names
# into a comma-separated string.
n_journals_less <- map_df(limit_perc, function(p)
  rank_journals %>%
    filter(perc_rank < p) %>%
    group_by(dep, year) %>%
    summarize(
      percentile = p,
      n_journals_less_perc = n(),
      journals = paste(journal, collapse = ",")
    ) %>%
    ungroup())
n_journals_less

Plot per departement and year:

# One bar chart per percentile threshold: number of journals inside the
# threshold per departement, faceted by year.
map(limit_perc, function(p) {
  n_journals_less %>%
    filter(percentile == p) %>%
    ggplot(aes(x = dep, y = n_journals_less_perc)) +
    geom_col() +
    facet_wrap(~ year) +
    theme(
      strip.text = element_text(size=12),
      axis.text.x = element_text(angle = 90, hjust = 1, size = 8)
    )
})
## $perc_rank_5

## 
## $perc_rank_10

## 
## $perc_rank_15

## 
## $perc_rank_20

## 
## $perc_rank_25

## 
## $perc_rank_30

The column journals contains the journals which are sufficiently cited to be included in this percentile-based threshold. They are also ordered by number of citations (rank). We save this data frame:

# Save the percentile-threshold results. `file` replaces the `path`
# argument of write_tsv(), which is deprecated since readr 1.4.
write_tsv(n_journals_less,
          file = "../data/processed/percentile_threshold_journals.txt",
          na = "")